***Clean Entrance/Clearance Data from USACE

clear
cd "H:\My Drive\Boats\ReplicationCode\data\entrance_clearance"

*import delimited "EC`year'_entr.txt"
*save EC`year'_data, replace
*import delimited "EC`year'_clear.txt" , clear
*append using EC`year'_data

*all available years:
*there are 18/19 variables in files for 2007-2011 and 23 variables for 2012-2016

forval year = 2007/2016 {
	clear
	import delimited "EC`year'_entr.txt"
	save EC`year'_data, replace
	import delimited "EC`year'_clear.txt" , clear
	append using EC`year'_data
	erase EC`year'_data.dta
	generate time=`year'
	destring imo, replace force
	*save EC`year'_data, replace
	save "`year'", replace
}

*port name: pww_name applies to 2007-2011 and port_name applies to 2012-2016
forval year = 2007/2011 {
	clear
	use `year'
	rename pww_name port_name
	rename imo imo_upd // so that imo columns match
	rename where_schk where_schedk // so sched_k matches
	save "`year'", replace
}

* import port mappings and convert
clear
import delimited ports_mapping.csv
rename port_agg ECport_agg
rename port_agg2 ECport_agg2
rename port_agg3 ECport_agg3
drop port_name
save ports_mapping , replace
rename port where_port
rename ECport_agg where_port_agg
rename ECport_agg2 where_port_agg2
rename ECport_agg3 where_port_agg3
gen post2009=0
save where_ports_mapping_pre2009 , replace

* creating single where_port map (includes flag for post 2009, which includes 2009)
clear
import delimited where_ports_mapping_post2009.csv
drop where_name
gen post2009=1
*save where_ports_mapping_post2009 , replace
rename port_agg where_port_agg
rename port_agg2 where_port_agg2
rename port_agg3 where_port_agg3
append using where_ports_mapping_pre2009
save where_ports_mapping , replace

* import schedule K codes
* first get region mapping
clear
import delimited country_region_mapping.csv , varnames(1)
save country_region , replace

* first get china regions mapping
clear
import delimited china_port_regions.csv , varnames(1)
drop forport_name ctry_name
duplicates drop 
rename forport_cd where_schedk
save china_region , replace

* get schedule k file
clear
import excel using schedk.xls , first
rename FORPORT_CD where_schedk
destring where_schedk , replace
rename CTRY_NAME country
merge m:1 country using country_region // add region mapping
drop _merge 

* updating groups of ports
gen schedk_port = mod(where_schedk,100)
replace FORPORT_NAME = country + ", all other ports" if schedk_port==0
duplicates drop where_schedk FORPORT_NAME , force
* there are lots of ports with same code, i'm using first by alphabetical
duplicates drop where_schedk , force

* splitting china into south, central, north
merge m:1 where_schedk using china_region // add china regions
replace country = country + " " + china_region if country=="China"

* clean up
drop schedk_port _merge*

* splitting east and west coast canadian ports (except those going to Vancouver ports)
*gen country = country
replace country = "Canada, WC" if where_schedk>=12000 & where_schedk<13000 & country~="Sea"

* splitting Russia to eastern and western
replace country = "Russia, EC" if where_schedk>=46222 & where_schedk<=46239 
replace region = "Asia" if country == "Russia, EC" 

save schedk , replace				// save

*append yearly .dta files
clear
use 2007
forval year = 2008/2016{
	append using `year', force
	erase `year'.dta
} 
erase 2007.dta

mvdecode where_port where_schedk , mv(9999 99999 0)

* merging in port identifiers consistent with AIS
merge m:1 port using ports_mapping 
drop _merge

* now merging where port (domestic)
gen post2009=0
replace post2009=1 if time>=2009
merge m:1 where_port post2009 using where_ports_mapping
drop if _merge==2
*drop _merge

* MERGE SCHED K
merge m:1 where_schedk using schedk , gen(_merge_schedk)
drop if _merge_schedk==2
drop _merge*

* creating variable that merges domestic and foreign ports with different aggregations
replace where_port_agg=region if where_ind=="F"
replace where_port_agg2=region if where_ind=="F"

replace where_port_agg="N/A" if where_port==. & where_schedk==.
replace where_port_agg2="N/A" if where_port==. & where_schedk==.

* creating variable by port name (foreign and domestic)
replace where_name=FORPORT_NAME if where_ind=="F"
replace where_name="N/A" if where_port==. & where_schedk==.

* these are port designations for fixed effects schemes

* By country, but breaking up US (East Coast, West Coast, AK, HI)
gen ec_country = where_port_agg3 
replace ec_country = country  if where_ind=="F"
replace ec_country = "N/A" if where_port==. & where_schedk==.

* By region, but breaking up US
gen ec_region = where_port_agg3 
replace ec_region = region  if where_ind=="F"
replace ec_region = "N/A" if where_port==. & where_schedk==.

* By foreign port, but breaking up US
gen ec_port = where_port_agg3 
replace ec_port = where_name  if where_ind=="F"
replace ec_port = "N/A" if where_port==. & where_schedk==.

*keep big ships only
tab rig
*1=dry cargo 71%, 2=tanker 10%, 3=tug 9%, 4=barge 6%, 5=tank barge 3%, 6=everything else 0.17%
*keep if rig==1 | rig==2
drop if rig==6| rig==3

gen vessel_str = "Cargo"
replace vessel_str = "Tanker" if inlist(rig,2,5)

* generate dates
gen day=substr(string(ecdate),-2,2)
*gen year=substr(string(ecdate),-3,1)

tostring ecdate, generate(str_ecdate)
replace str_ecdate = "0" + str_ecdate if length(str_ecdate) == 4
gen month = substr(str_ecdate,1,2)
destring day month , replace

gen year = time
drop time

* clean up flag ISO codes (make all caps)
rename flag flag_old
gen flag = upper(flag_old)
drop flag_old

* cleaning up and saving
*drop schedk_port
*rename FORPORT_NAME where_name

drop post2009 region 
save ec0716_clean ,replace

